#include <stdlib.h>
#include "combine.h"

/* Combining functions */

char combine1_descr[] = "combine1: Maximum use of data abstraction";
/* $begin combine1 */
/* Implementation with maximum use of data abstraction */
void combine1(vec_ptr v, data_t *dest)
{
    long i;

    *dest = IDENT;
    for (i = 0; i < vec_length(v); i++) {
	data_t val;
	get_vec_element(v, i, &val);
	/* $begin combineline */
	*dest = *dest OP val;
	/* $end combineline */
    }
}
/* $end combine1 */

char combine2_descr[] = "combine2: Take vec_length() out of loop";
/* $begin combine2 */
/* Move call to vec_length out of loop */
void combine2(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);

    *dest = IDENT;
    for (i = 0; i < length; i++) {
	data_t val;
	get_vec_element(v, i, &val);
	*dest = *dest OP val;
    }
}
/* $end combine2 */

char combine3_descr[] = "combine3: Array reference to vector data";
/* $begin combine3 */
/* Direct access to vector data */
void combine3(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    data_t *data = get_vec_start(v);

    *dest = IDENT;
    for (i = 0; i < length; i++) {
	*dest = *dest OP data[i];
    }
}
/* $end combine3 */

char combine3w_descr[] = "combine3w: Update *dest within loop only with write";
/* $begin combine3w */
/* Make sure dest updated on each iteration */
void combine3w(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Initialize in event length <= 0 */
    *dest = acc; 

    for (i = 0; i < length; i++) {
	acc = acc OP data[i];
	*dest = acc;
    }
}
/* $end combine3w */

char combine4_descr[] = "combine4: Array reference, accumulate in temporary";
/* $begin combine4 */
/* Accumulate result in local variable */
void combine4(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    for (i = 0; i < length; i++) {
	acc = acc OP data[i];
    }
    *dest = acc;
}
/* $end combine4 */

char combine4b_descr[] = "combine4b: Include bonds check in loop";
/* $begin combine4b */
/* Include bounds check in loop */
void combine4b(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    data_t acc = IDENT;

    for (i = 0; i < length; i++) {
	if (i >= 0 && i < v->len) {
	    acc = acc OP v->data[i];
	}
    }
    *dest = acc;
}
/* $end combine4b */


char combine4p_descr[] = "combine4p: Pointer reference, accumulate in temporary";
/* $begin combine4p */
/* Accumulate in local variable, pointer version */
void combine4p(vec_ptr v, data_t *dest)
{
    long length = vec_length(v);
    data_t *data = get_vec_start(v);
    data_t *dend = data+length;
    data_t acc = IDENT;

    for (; data < dend; data++)
	acc = acc OP *data;
    *dest = acc;
}
/* $end combine4p */


char combine5_descr[] = "combine5: Array code, unrolled by 2";
/* $begin combine5 */
/* 2 x 1 loop unrolling */
void combine5(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-1;
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Combine 2 elements at a time */
    for (i = 0; i < limit; i+=2) {
      /* $begin combine5-update */
	acc = (acc OP data[i]) OP data[i+1];
      /* $end combine5-update */
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc = acc OP data[i];
    }
    *dest = acc;
}
/* $end combine5 */

char unroll3a_descr[] = "unroll3a: Array code, unrolled by 3";
void unroll3a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-2;
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Combine 3 elements at a time */
    for (i = 0; i < limit; i+=3) {
	acc = acc OP data[i];
	acc = acc OP data[i+1];
	acc = acc OP data[i+2];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc = acc OP data[i];
    }
    *dest = acc;
}


char combine5p_descr[] = "combine5p: Pointer code, unrolled by 2, for loop";
/* $begin combine5p */
/* 2 x 1 loop unrolling, pointer version */
void combine5p(vec_ptr v, data_t *dest)
{
    data_t *data = get_vec_start(v);
    data_t *dend = data+vec_length(v);
    data_t *dlimit = dend-1;
    data_t acc = IDENT;

    /* Combine 3 elements at a time */
    for (; data < dlimit; data += 2) {
	acc = acc OP data[0] OP data[1];
    }

    /* Finish any remaining elements */
    for (; data < dend; data++) {
	acc = acc OP data[0];
    }
    *dest = acc;
}
/* $end combine5p */

char unroll2aw_descr[] = "unroll2aw: Array code, unrolled by 2, while loop";
void unroll2aw_combine(vec_ptr v, data_t *dest)
{
    long i = 0;
    long length = vec_length(v);
    long limit = length-1;
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Combine 2 elements at a time */
    while (i < limit) {
	acc = acc OP data[i];
	i+= 2;
	acc = acc OP data[i-1];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc = acc OP data[i];
    }
    *dest = acc;
}


char unroll4a_descr[] = "unroll4a: Array code, unrolled by 4";
void unroll4a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-3;
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Combine 4 elements at a time */
    for (i = 0; i < limit; i+=4) {
	acc = acc OP data[i] OP data[i+1];
	acc = acc OP data[i+2] OP data[i+3];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc = acc OP data[i];
    }
    *dest = acc;
}

char unroll5a_descr[] = "unroll5a: Array code, unrolled by 5";
#if 0
/* $begin unroll5 */
void unroll5(vec_ptr v, data_t *dest)
/* $end unroll5 */
#else
void unroll5a_combine(vec_ptr v, data_t *dest)
#endif
/* $begin unroll5 */
{
    long i;
    long length = vec_length(v);
    long limit = length-4;
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Combine 5 elements at a time */
    for (i = 0; i < limit; i+=5) {
	acc = acc OP data[i]   OP data[i+1];
	acc = acc OP data[i+2] OP data[i+3];
	acc = acc OP data[i+4];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc = acc OP data[i];
    }
    *dest = acc;
}
/* $end unroll5 */

char unroll6a_descr[] = "unroll6a: Array code, unrolled by 6";
void unroll6a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-5;
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Combine 6 elements at a time */
    for (i = 0; i < limit; i+=6) {
	acc = acc OP data[i] OP data[i+1];
	acc = acc OP data[i+2] OP data[i+3];
	acc = acc OP data[i+4] OP data[i+5];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc = acc OP data[i];
    }
    *dest = acc;
}

char unroll7a_descr[] = "unroll7a: Array code, unrolled by 7";
void unroll7a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-6;
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Combine 7 elements at a time */
    for (i = 0; i < limit; i+=7) {
	acc = acc OP data[i] OP data[i+1];
	acc = acc OP data[i+2] OP data[i+3];
	acc = acc OP data[i+4] OP data[i+5];
	acc = acc OP data[i+6];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc = acc OP data[i];
    }
    *dest = acc;
}


char unroll8a_descr[] = "unroll8a: Array code, unrolled by 8";
void unroll8a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-7;
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Combine 8 elements at a time */
    for (i = 0; i < limit; i+=8) {
	acc = acc OP data[i] OP data[i+1];
	acc = acc OP data[i+2] OP data[i+3];
	acc = acc OP data[i+4] OP data[i+5];
	acc = acc OP data[i+6] OP data[i+7];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc = acc OP data[i];
    }
    *dest = acc;
}

char unroll9a_descr[] = "unroll9a: Array code, unrolled by 9";
void unroll9a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-8;
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Combine 9 elements at a time */
    for (i = 0; i < limit; i+=9) {
	acc = acc OP data[i] OP data[i+1];
	acc = acc OP data[i+2] OP data[i+3];
	acc = acc OP data[i+4] OP data[i+5];
	acc = acc OP data[i+6] OP data[i+7];
	acc = acc OP data[i+8];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc = acc OP data[i];
    }
    *dest = acc;
}

char unroll10a_descr[] = "unroll10a: Array code, unrolled by 10";
void unroll10a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-9;
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Combine 10 elements at a time */
    for (i = 0; i < limit; i+=10) {
	acc = acc OP data[i] OP data[i+1];
	acc = acc OP data[i+2] OP data[i+3];
	acc = acc OP data[i+4] OP data[i+5];
	acc = acc OP data[i+6] OP data[i+7];
	acc = acc OP data[i+8] OP data[i+9];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc = acc OP data[i];
    }
    *dest = acc;
}


char unroll16a_descr[] = "unroll16a: Array code, unrolled by 16";
void unroll16a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-15;
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Combine 16 elements at a time */
    for (i = 0; i < limit; i+=16) {
	acc = acc OP data[i] OP data[i+1];
	acc = acc OP data[i+2] OP data[i+3];
	acc = acc OP data[i+4] OP data[i+5];
	acc = acc OP data[i+6] OP data[i+7];
	acc = acc OP data[i+8] OP data[i+9];
	acc = acc OP data[i+10] OP data[i+11];
	acc = acc OP data[i+12] OP data[i+13];
	acc = acc OP data[i+14] OP data[i+15];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc = acc OP data[i];
    }
    *dest = acc;
}

char unroll2_descr[] = "unroll2: Pointer code, unrolled by 2";
void unroll2_combine(vec_ptr v, data_t *dest)
{
    long length = vec_length(v);
    data_t *data = get_vec_start(v);
    int over = length%2;
    data_t *dend = data+length-over;
    data_t acc = IDENT;

    while (data < dend) {
	acc = acc OP data[0];
	acc = acc OP data[1];
	data += 2;
    }
    dend += over;
    while (data < dend) {
	acc = acc OP *data;
	data ++;
    }
    *dest = acc;
}


char unroll3_descr[] = "unroll3: Pointer code, unrolled by 3";
void unroll3_combine(vec_ptr v, data_t *dest)
{
    long length = vec_length(v);
    data_t *data = get_vec_start(v);
    data_t *dend = data+length-2;
    data_t acc = IDENT;

    while (data < dend) {
	acc = acc OP data[0];
	acc = acc OP data[1];
	acc = acc OP data[2];
	data += 3;
    }
    dend += 2;
    while (data < dend) {
	acc = acc OP *data;
	data ++;
    }
    *dest = acc;
}


char unroll4_descr[] = "unroll4: Pointer code, unrolled by 4";
void unroll4_combine(vec_ptr v, data_t *dest)
{
    long length = vec_length(v);
    data_t *data = get_vec_start(v);
    data_t *dend = data+length-3;
    data_t acc = IDENT;

    while (data < dend) {
	acc = acc OP data[0];
	acc = acc OP data[1];
	acc = acc OP data[2];
	acc = acc OP data[3];
	data += 4;
    }
    dend += 3;
    while (data < dend) {
	acc = acc OP *data;
	data ++;
    }
    *dest = acc;
}

char unroll8_descr[] = "unroll8: Pointer code, unrolled by 8";
void unroll8_combine(vec_ptr v, data_t *dest)
{
    long length = vec_length(v);
    data_t *data = get_vec_start(v);
    int over = length%8;
    data_t *dend = data+length-over;
    data_t acc = IDENT;

    while (data < dend) {
	acc = acc OP data[0];
	acc = acc OP data[1];
	acc = acc OP data[2];
	acc = acc OP data[3];
	acc = acc OP data[4];
	acc = acc OP data[5];
	acc = acc OP data[6];
	acc = acc OP data[7];
	data += 8;
    }
    dend += over;
    while (data < dend) {
	acc = acc OP *data;
	data ++;
    }
    *dest = acc;
}

char unroll16_descr[] = "unroll16: Pointer code, unrolled by 16";
void unroll16_combine(vec_ptr v, data_t *dest)
{
    long length = vec_length(v);
    data_t *data = get_vec_start(v);
    int over = length%16;
    data_t *dend = data+length-over;
    data_t acc = IDENT;

    while (data < dend) {
	acc = acc OP data[0];
	acc = acc OP data[1];
	acc = acc OP data[2];
	acc = acc OP data[3];
	acc = acc OP data[4];
	acc = acc OP data[5];
	acc = acc OP data[6];
	acc = acc OP data[7];
	acc = acc OP data[8];
	acc = acc OP data[9];
	acc = acc OP data[10];
	acc = acc OP data[11];
	acc = acc OP data[12];
	acc = acc OP data[13];
	acc = acc OP data[14];
	acc = acc OP data[15];
	data += 16;
    }
    dend += over;
    while (data < dend) {
	acc = acc OP *data;
	data ++;
    }
    *dest = acc;
}


char combine6_descr[] = "combine6: Array code, unrolled by 2, Superscalar x2";
/* $begin combine6 */
/* 2 x 2 loop unrolling */
void combine6(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-1;
    data_t *data = get_vec_start(v);
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;

    /* Combine 2 elements at a time */
    for (i = 0; i < limit; i+=2) {
	acc0 = acc0 OP data[i]; 
	acc1 = acc1 OP data[i+1];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc0 = acc0 OP data[i];
    }
    *dest = acc0 OP acc1;
}
/* $end combine6 */

char unroll4x2a_descr[] = "unroll4x2a: Array code, unrolled by 4, Superscalar x2";
void unroll4x2a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-3;
    data_t *data = get_vec_start(v);
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;

    /* Combine 4 elements at a time */
    for (i = 0; i < limit; i+=4) {
	acc0 = acc0 OP data[i]; acc1 = acc1 OP data[i+1];
	acc0 = acc0 OP data[i+2]; acc1 = acc1 OP data[i+3];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc0 = acc0 OP data[i];
    }
    *dest = acc0 OP acc1;
}


char unroll8x2a_descr[] = "unroll8x2a: Array code, unrolled by 8, Superscalar x2";
void unroll8x2a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-7;
    data_t *data = get_vec_start(v);
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;

    /* Combine 8 elements at a time */
    for (i = 0; i < limit; i+=8) {
	acc0 = acc0 OP data[i]; acc1 = acc1 OP data[i+1];
	acc0 = acc0 OP data[i+2]; acc1 = acc1 OP data[i+3];
	acc0 = acc0 OP data[i+4]; acc1 = acc1 OP data[i+5];
	acc0 = acc0 OP data[i+6]; acc1 = acc1 OP data[i+7];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc0 = acc0 OP data[i];
    }
    *dest = acc0 OP acc1;
}

char unroll3x3a_descr[] = "unroll3x3a: Array code, unrolled by 3, Superscalar x3";
void unroll3x3a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-2;
    data_t *data = get_vec_start(v);
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;
    data_t acc2 = IDENT;

    /* Combine 4 elements at a time */
    for (i = 0; i < limit; i+=3) {
	acc0 = acc0 OP data[i]; acc1 = acc1 OP data[i+1];
	acc2 = acc2 OP data[i+2];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc0 = acc0 OP data[i];
    }
    *dest = acc0 OP acc1 OP acc2;
}


char unroll4x4a_descr[] = "unroll4x4a: Array code, unrolled by 4, Superscalar x4";
void unroll4x4a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-3;
    data_t *data = get_vec_start(v);
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;
    data_t acc2 = IDENT;
    data_t acc3 = IDENT;

    /* Combine 4 elements at a time */
    for (i = 0; i < limit; i+=4) {
	acc0 = acc0 OP data[i]; acc1 = acc1 OP data[i+1];
	acc2 = acc2 OP data[i+2]; acc3 = acc3 OP data[i+3];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc0 = acc0 OP data[i];
    }
    *dest = (acc0 OP acc1) OP (acc2 OP acc3);

}

char unroll8x4a_descr[] = "unroll8x4a: Array code, unrolled by 8, Superscalar x4";
void unroll8x4a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-7;
    data_t *data = get_vec_start(v);
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;
    data_t acc2 = IDENT;
    data_t acc3 = IDENT;

    /* Combine 8 elements at a time */
    for (i = 0; i < limit; i+=8) {
	acc0 = acc0 OP data[i];   acc1 = acc1 OP data[i+1];
	acc2 = acc2 OP data[i+2]; acc3 = acc3 OP data[i+3];
	acc0 = acc0 OP data[i+4]; acc1 = acc1 OP data[i+5];
	acc2 = acc2 OP data[i+6]; acc3 = acc3 OP data[i+7];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc0 = acc0 OP data[i];
    }
    *dest = acc0 OP acc1 OP acc2 OP acc3;
}

char unroll12x6a_descr[] = "unroll2x6a: Array code, unrolled by 12, Superscalar x6";
void unroll12x6a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-11;
    data_t *data = get_vec_start(v);
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;
    data_t acc2 = IDENT;
    data_t acc3 = IDENT;
    data_t acc4 = IDENT;
    data_t acc5 = IDENT;

    /* Combine 12 elements at a time */
    for (i = 0; i < limit; i+=12) {
	acc0 = acc0 OP data[i];
	acc0 = acc0 OP data[i+6]; 
	acc1 = acc1 OP data[i+1];
	acc1 = acc1 OP data[i+7];
	acc2 = acc2 OP data[i+2];
	acc2 = acc2 OP data[i+8]; 
	acc3 = acc3 OP data[i+3];
	acc3 = acc3 OP data[i+9];
	acc4 = acc4 OP data[i+4]; 
	acc4 = acc4 OP data[i+10]; 
	acc5 = acc5 OP data[i+5];
	acc5 = acc5 OP data[i+11];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc0 = acc0 OP data[i];
    }
    *dest = (acc0 OP acc1) OP (acc2 OP acc3) OP (acc4 OP acc5);
}

char unroll12x12a_descr[] = "unroll12x12a: Array code, unrolled by 12, Superscalar x12";
void unroll12x12a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-11;
    data_t *data = get_vec_start(v);
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;
    data_t acc2 = IDENT;
    data_t acc3 = IDENT;
    data_t acc4 = IDENT;
    data_t acc5 = IDENT;
    data_t acc6 = IDENT;
    data_t acc7 = IDENT;
    data_t acc8 = IDENT;
    data_t acc9 = IDENT;
    data_t acc10 = IDENT;
    data_t acc11= IDENT;

    /* Combine 12 elements at a time */
    for (i = 0; i < limit; i+=12) {
	acc0 = acc0 OP data[i];
	acc6 = acc6 OP data[i+6]; 
	acc1 = acc1 OP data[i+1];
	acc7 = acc7 OP data[i+7];
	acc2 = acc2 OP data[i+2];
	acc8 = acc8 OP data[i+8]; 
	acc3 = acc3 OP data[i+3];
	acc9 = acc9 OP data[i+9];
	acc4 = acc4 OP data[i+4]; 
	acc10 = acc10 OP data[i+10]; 
	acc5 = acc5 OP data[i+5];
	acc11 = acc11 OP data[i+11];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc0 = acc0 OP data[i];
    }
    *dest = (acc0 OP acc1) OP (acc2 OP acc3) OP (acc4 OP acc5)
	OP (acc6 OP acc7) OP (acc8 OP acc9) OP (acc10 OP acc11);
}

char unroll16x16a_descr[] = "unroll16x16a: Array code, unrolled by 16, Superscalar x16";
void unroll16x16a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-15;
    data_t *data = get_vec_start(v);
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;
    data_t acc2 = IDENT;
    data_t acc3 = IDENT;
    data_t acc4 = IDENT;
    data_t acc5 = IDENT;
    data_t acc6 = IDENT;
    data_t acc7 = IDENT;
    data_t acc8 = IDENT;
    data_t acc9 = IDENT;
    data_t acc10 = IDENT;
    data_t acc11= IDENT;
    data_t acc12= IDENT;
    data_t acc13= IDENT;
    data_t acc14= IDENT;
    data_t acc15= IDENT;

    /* Combine 16 elements at a time */
    for (i = 0; i < limit; i+=16) {
	acc0 = acc0 OP data[i];
	acc6 = acc6 OP data[i+6]; 
	acc1 = acc1 OP data[i+1];
	acc7 = acc7 OP data[i+7];
	acc2 = acc2 OP data[i+2];
	acc8 = acc8 OP data[i+8]; 
	acc3 = acc3 OP data[i+3];
	acc9 = acc9 OP data[i+9];
	acc4 = acc4 OP data[i+4]; 
	acc10 = acc10 OP data[i+10]; 
	acc5 = acc5 OP data[i+5];
	acc11 = acc11 OP data[i+11];
	acc12 = acc12 OP data[i+12];
	acc13 = acc13 OP data[i+13];
	acc14 = acc14 OP data[i+14];
	acc15 = acc15 OP data[i+15];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc0 = acc0 OP data[i];
    }
    *dest = ((acc0 OP acc1) OP (acc2 OP acc3) OP (acc4 OP acc5))
	OP ((acc6 OP acc7) OP (acc8 OP acc9) OP (acc10 OP acc11))
	OP ((acc12 OP acc13) OP (acc14 OP acc15));
}

char unroll20x20a_descr[] = "unroll20x20a: Array code, unrolled by 20, Superscalar x20";
void unroll20x20a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-19;
    data_t *data = get_vec_start(v);
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;
    data_t acc2 = IDENT;
    data_t acc3 = IDENT;
    data_t acc4 = IDENT;
    data_t acc5 = IDENT;
    data_t acc6 = IDENT;
    data_t acc7 = IDENT;
    data_t acc8 = IDENT;
    data_t acc9 = IDENT;
    data_t acc10 = IDENT;
    data_t acc11= IDENT;
    data_t acc12= IDENT;
    data_t acc13= IDENT;
    data_t acc14= IDENT;
    data_t acc15= IDENT;
    data_t acc16= IDENT;
    data_t acc17= IDENT;
    data_t acc18= IDENT;
    data_t acc19= IDENT;

    /* Combine 20 elements at a time */
    for (i = 0; i < limit; i+=20) {
	acc0 = acc0 OP data[i];
	acc6 = acc6 OP data[i+6]; 
	acc1 = acc1 OP data[i+1];
	acc7 = acc7 OP data[i+7];
	acc2 = acc2 OP data[i+2];
	acc8 = acc8 OP data[i+8]; 
	acc3 = acc3 OP data[i+3];
	acc9 = acc9 OP data[i+9];
	acc4 = acc4 OP data[i+4]; 
	acc10 = acc10 OP data[i+10]; 
	acc5 = acc5 OP data[i+5];
	acc11 = acc11 OP data[i+11];
	acc12 = acc12 OP data[i+12];
	acc13 = acc13 OP data[i+13];
	acc14 = acc14 OP data[i+14];
	acc15 = acc15 OP data[i+15];
	acc16 = acc16 OP data[i+16];
	acc17 = acc17 OP data[i+17];
	acc18 = acc18 OP data[i+18];
	acc19 = acc19 OP data[i+19];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc0 = acc0 OP data[i];
    }
    *dest =
	(((acc0 OP acc1) OP (acc2 OP acc3) OP (acc4 OP acc5))
	 OP ((acc6 OP acc7) OP (acc8 OP acc9) OP (acc10 OP acc11)))
	OP
	(((acc12 OP acc13) OP (acc14 OP acc15)) OP ((acc16 OP acc17) OP (acc18 OP acc19)));
}


char unroll5x5a_descr[] = "unroll5x5a: Array code, unrolled by 5, Superscalar x5";
void unroll5x5a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-4;
    data_t *data = get_vec_start(v);
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;
    data_t acc2 = IDENT;
    data_t acc3 = IDENT;
    data_t acc4 = IDENT;

    /* Combine 5 elements at a time */
    for (i = 0; i < limit; i+=5) {
	acc0 = acc0 OP data[i];   acc1 = acc1 OP data[i+1];
	acc2 = acc2 OP data[i+2]; acc3 = acc3 OP data[i+3];
	acc4 = acc4 OP data[i+4];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc0 = acc0 OP data[i];
    }
    *dest = (acc0 OP acc1) OP (acc2 OP acc3 OP acc4);
}

char unroll6x6a_descr[] = "unroll6x6a: Array code, unrolled by 6, Superscalar x6";
void unroll6x6a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-5;
    data_t *data = get_vec_start(v);
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;
    data_t acc2 = IDENT;
    data_t acc3 = IDENT;
    data_t acc4 = IDENT;
    data_t acc5 = IDENT;

    /* Combine 6 elements at a time */
    for (i = 0; i < limit; i+=6) {
	acc0 = acc0 OP data[i];   acc1 = acc1 OP data[i+1];
	acc2 = acc2 OP data[i+2]; acc3 = acc3 OP data[i+3];
	acc4 = acc4 OP data[i+4]; acc5 = acc5 OP data[i+5];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc0 = acc0 OP data[i];
    }
    *dest = (acc0 OP acc1) OP (acc2 OP acc3) OP (acc4 OP acc5);
}

char unroll7x7a_descr[] = "unroll7x7a: Array code, unrolled by 7, Superscalar x7";
void unroll7x7a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-6;
    data_t *data = get_vec_start(v);
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;
    data_t acc2 = IDENT;
    data_t acc3 = IDENT;
    data_t acc4 = IDENT;
    data_t acc5 = IDENT;
    data_t acc6 = IDENT;

    /* Combine 7 elements at a time */
    for (i = 0; i < limit; i+=7) {
	acc0 = acc0 OP data[i];   acc1 = acc1 OP data[i+1];
	acc2 = acc2 OP data[i+2]; acc3 = acc3 OP data[i+3];
	acc4 = acc4 OP data[i+4]; acc5 = acc5 OP data[i+5];
	acc6 = acc6 OP data[i+6];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc0 = acc0 OP data[i];
    }
    *dest = ((acc0 OP acc1) OP (acc2 OP acc3)) OP (acc4 OP acc5 OP acc6);
}

char unroll8x8a_descr[] = "unroll8x8a: Array code, unrolled by 8, Superscalar x8";
void unroll8x8a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-7;
    data_t *data = get_vec_start(v);
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;
    data_t acc2 = IDENT;
    data_t acc3 = IDENT;
    data_t acc4 = IDENT;
    data_t acc5 = IDENT;
    data_t acc6 = IDENT;
    data_t acc7 = IDENT;

    /* Combine 8 elements at a time */
    for (i = 0; i < limit; i+=8) {
	acc0 = acc0 OP data[i];   acc1 = acc1 OP data[i+1];
	acc2 = acc2 OP data[i+2]; acc3 = acc3 OP data[i+3];
	acc4 = acc4 OP data[i+4]; acc5 = acc5 OP data[i+5];
	acc6 = acc6 OP data[i+6]; acc7 = acc7 OP data[i+7];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc0 = acc0 OP data[i];
    }
    *dest = ((acc0 OP acc1) OP (acc2 OP acc3)) OP ((acc4 OP acc5) OP (acc6 OP acc7));
}

char unroll9x9a_descr[] = "unroll9x9a: Array code, unrolled by 9, Superscalar x9";
void unroll9x9a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-8;
    data_t *data = get_vec_start(v);
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;
    data_t acc2 = IDENT;
    data_t acc3 = IDENT;
    data_t acc4 = IDENT;
    data_t acc5 = IDENT;
    data_t acc6 = IDENT;
    data_t acc7 = IDENT;
    data_t acc8 = IDENT;

    /* Combine 9 elements at a time */
    for (i = 0; i < limit; i+=9) {
	acc0 = acc0 OP data[i];   acc1 = acc1 OP data[i+1];
	acc2 = acc2 OP data[i+2]; acc3 = acc3 OP data[i+3];
	acc4 = acc4 OP data[i+4]; acc5 = acc5 OP data[i+5];
	acc6 = acc6 OP data[i+6]; acc7 = acc7 OP data[i+7];
	acc8 = acc8 OP data[i+8];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc0 = acc0 OP data[i];
    }
    *dest = ((acc0 OP acc1) OP (acc2 OP acc3)) OP ((acc4 OP acc5) OP (acc6 OP acc7) OP acc8);
}

char unroll10x10a_descr[] = "unroll10x10a: Array code, unrolled by 10, Superscalar x10";
void unroll10x10a_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-9;
    data_t *data = get_vec_start(v);
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;
    data_t acc2 = IDENT;
    data_t acc3 = IDENT;
    data_t acc4 = IDENT;
    data_t acc5 = IDENT;
    data_t acc6 = IDENT;
    data_t acc7 = IDENT;
    data_t acc8 = IDENT;
    data_t acc9 = IDENT;

    /* Combine 10 elements at a time */
    for (i = 0; i < limit; i+=10) {
	acc0 = acc0 OP data[i];   acc1 = acc1 OP data[i+1];
	acc2 = acc2 OP data[i+2]; acc3 = acc3 OP data[i+3];
	acc4 = acc4 OP data[i+4]; acc5 = acc5 OP data[i+5];
	acc6 = acc6 OP data[i+6]; acc7 = acc7 OP data[i+7];
	acc8 = acc8 OP data[i+8]; acc9 = acc9 OP data[i+9];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc0 = acc0 OP data[i];
    }
    *dest = ((acc0 OP acc1) OP (acc2 OP acc3)) OP
	((acc4 OP acc5) OP (acc6 OP acc7)) OP
	(acc8 OP acc9);
}

char unrollx2as_descr[] = "unrollx2as: Array code, Unroll x2, Superscalar x2, noninterleaved";
void unrollx2as_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length/2;
    data_t *data = get_vec_start(v);
    data_t *data2 = data+limit;
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;

    /* Combine 2 elements at a time */
    for (i = 0; i < limit; i++) {
	acc0 = acc0 OP data[i]; acc1 = acc1 OP data2[i];
    }

    /* Finish any remaining elements */
    for (i = limit*2; i < length; i++) {
	acc1 = acc1 OP data[i];
    }
    *dest = acc0 OP acc1;
}

char unroll4x2as_descr[] = "unroll4x2as: Array code, Unroll x4, Superscalar x2, noninterleaved";
void unroll4x2as_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length/2;
    data_t *data = get_vec_start(v);
    data_t *data2 = data+limit;
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;

    /* Combine 2 elements at a time */
    for (i = 0; i < limit; i++) {
	acc0 = acc0 OP data[i]; acc1 = acc1 OP data2[i];
    }

    /* Finish any remaining elements */
    for (i = limit*2; i < length; i++) {
	acc1 = acc1 OP data[i];
    }
    *dest = acc0 OP acc1;
}



char unroll8x2_descr[] = "unroll8x2: Pointer code, unrolled by 8, Superscalar x2";
void unroll8x2_combine(vec_ptr v, data_t *dest)
{
    long length = vec_length(v);
    data_t *data = get_vec_start(v);
    data_t *dend = data+length-7;
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;

    while (data < dend) {
	acc0 = acc0 OP data[0];
	acc1 = acc1 OP data[1];
	acc0 = acc0 OP data[2];
	acc1 = acc1 OP data[3];
	acc0 = acc0 OP data[4];
	acc1 = acc1 OP data[5];
	acc0 = acc0 OP data[6];
	acc1 = acc1 OP data[7];
	data += 8;
    }
    dend += 7;
    while (data < dend) {
	acc0 = acc0 OP *data;
	data ++;
    }
    *dest = acc0 OP acc1;
}

char unroll9x3_descr[] = "unroll9x3: Pointer code, unrolled by 9, Superscalar x3";
void unroll9x3_combine(vec_ptr v, data_t *dest)
{
    long length = vec_length(v);
    data_t *data = get_vec_start(v);
    data_t *dend = data+length-8;
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;
    data_t acc2 = IDENT;

    while (data < dend) {
	acc0 = acc0 OP data[0];
	acc1 = acc1 OP data[1];
	acc2 = acc2 OP data[2];
	acc0 = acc0 OP data[3];
	acc1 = acc1 OP data[4];
	acc2 = acc2 OP data[5];
	acc0 = acc0 OP data[6];
	acc1 = acc1 OP data[7];
	acc2 = acc2 OP data[8];
	data += 9;
    }
    dend += 8;
    while (data < dend) {
	acc0 = acc0 OP *data;
	data ++;
    }
    *dest = acc0 OP acc1 OP acc2; 
}


char unroll8x4_descr[] = "unroll8x4: Pointer code, unrolled by 8, Superscalar x4";
void unroll8x4_combine(vec_ptr v, data_t *dest)
{
    long length = vec_length(v);
    data_t *data = get_vec_start(v);
    data_t *dend = data+length-7;
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;
    data_t acc2 = IDENT;
    data_t acc3 = IDENT;

    while (data < dend) {
	acc0 = acc0 OP data[0];
	acc1 = acc1 OP data[1];
	acc2 = acc2 OP data[2];
	acc3 = acc3 OP data[3];
	acc0 = acc0 OP data[4];
	acc1 = acc1 OP data[5];
	acc2 = acc2 OP data[6];
	acc3 = acc3 OP data[7];
	data += 8;
    }
    dend += 7;
    while (data < dend) {
	acc3 = acc3 OP *data;
	data ++;
    }
    *dest = acc3 OP acc0 OP acc1 OP acc2;
}

char unroll8x8_descr[] = "unroll8x8: Pointer code, unrolled by 8, Superscalar x8";
void unroll8x8_combine(vec_ptr v, data_t *dest)
{
    long length = vec_length(v);
    data_t *data = get_vec_start(v);
    data_t *dend = data+length-7;
    data_t acc0 = IDENT;
    data_t acc1 = IDENT;
    data_t acc2 = IDENT;
    data_t acc3 = IDENT;
    data_t acc4 = IDENT;
    data_t acc5 = IDENT;
    data_t acc6 = IDENT;
    data_t acc7 = IDENT;

    while (data < dend) {
	acc0 = acc0 OP data[0];
	acc1 = acc1 OP data[1];
	acc2 = acc2 OP data[2];
	acc3 = acc3  OP data[3];
	acc4 = acc4 OP data[4];
	acc5 = acc5 OP data[5];
	acc6 = acc6 OP data[6];
	acc7 = acc7 OP data[7];
	data += 8;
    }
    dend += 7;
    while (data < dend) {
	acc0 = acc0 OP *data;
	data ++;
    }
    *dest = acc0 OP acc1 OP acc2 OP acc3 OP acc4 OP acc5 OP acc6 OP acc7;
}

char combine7_descr[] = "combine7: Array code, unrolled by 2, different associativity";
/* $begin combine7 */
/* 2 x 1a loop unrolling */
void combine7(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-1;
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Combine 2 elements at a time */
    for (i = 0; i < limit; i+=2) {
      /* $begin combine7-update */
	acc = acc OP (data[i] OP data[i+1]);
      /* $end combine7-update */
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc = acc OP data[i];
    }
    *dest = acc;
}
/* $end combine7 */

char unroll3aa_descr[] = "unroll3aa: Array code, unrolled by 3, Different Associativity";
void unroll3aa_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-2;
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Combine 3 elements at a time */
    for (i = 0; i < limit; i+=3) {
	acc = acc OP (data[i] OP data[i+1] OP data[i+2]);
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc = acc OP data[i];
    }
    *dest = acc;
}

char unroll4aa_descr[] = "unroll4aa: Array code, unrolled by 4, Different Associativity";
void unroll4aa_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-3;
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Combine 4 elements at a time */
    for (i = 0; i < limit; i+=4) {
	data_t t1 = data[i] OP data[i+1];
	data_t t2 = data[i+2] OP data[i+3];
	acc = acc OP (t1 OP t2); 
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc = acc OP data[i];
    }
    *dest = acc;
}

char unroll5aa_descr[] = "unroll5aa: Array code, unrolled by 5, Different Associativity";
void unroll5aa_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-4;
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Combine 5 elements at a time */
    for (i = 0; i < limit; i+=5) {
	data_t t1 = data[i] OP data[i+1];
	data_t t2 = data[i+2] OP data[i+3];
	data_t t3 = data[i+4];
	acc = acc OP (t1 OP t2 OP t3); 
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc = acc OP data[i];
    }
    *dest = acc;
}

char unroll6aa_descr[] = "unroll6aa: Array code, unrolled by 6, Different Associativity";
void unroll6aa_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-5;
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Combine 6 elements at a time */
    for (i = 0; i < limit; i+=6) {
	data_t t1 = data[i] OP data[i+1];
	data_t t2 = data[i+2] OP data[i+3];
	data_t t3 = data[i+4] OP data[i+5];
	acc = acc OP (t1 OP t2 OP t3); 
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc = acc OP data[i];
    }
    *dest = acc;
}

char unroll7aa_descr[] = "unroll7aa: Array code, unrolled by 7, Different Associativity";
void unroll7aa_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-6;
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Combine 7 elements at a time */
    for (i = 0; i < limit; i+=7) {
	data_t t1 = data[i] OP data[i+1];
	data_t t2 = data[i+2] OP data[i+3];
	data_t u1 = t1 OP t2;
	data_t t3 = data[i+4] OP data[i+5];
	data_t t4 = data[i+6];
	data_t u2 = t3 OP t4;
	acc = acc OP (u1 OP u2); 
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc = acc OP data[i];
    }

    *dest = acc;
}

char unroll8aa_descr[] = "unroll8aa: Array code, unrolled by 8, Different Associativity";
void unroll8aa_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-7;
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Combine 8 elements at a time */
    for (i = 0; i < limit; i+=8) {
	data_t t1 = data[i] OP data[i+1];
	data_t t2 = data[i+2] OP data[i+3];
	data_t u1 = t1 OP t2;
	data_t t3 = data[i+4] OP data[i+5];
	data_t t4 = data[i+6] OP data[i+7];
	data_t u2 = t3 OP t4;
	acc = acc OP (u1 OP u2); 
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc = acc OP data[i];
    }

    *dest = acc;
}

char unroll9aa_descr[] = "unroll9aa: Array code, unrolled by 9, Different Associativity";
void unroll9aa_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-8;
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Combine 9 elements at a time */
    for (i = 0; i < limit; i+=9) {
	data_t t1 = data[i] OP data[i+1];
	data_t t2 = data[i+2] OP data[i+3];
	data_t u1 = t1 OP t2;
	data_t t3 = data[i+4] OP data[i+5];
	data_t t4 = data[i+6] OP data[i+7];
	data_t t5 = data[i+8];
	data_t u2 = t3 OP t4 OP t5;
	acc = acc OP (u1 OP u2); 
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc = acc OP data[i];
    }

    *dest = acc;
}

char unroll10aa_descr[] = "unroll10aa: Array code, unrolled by 10, Different Associativity";
void unroll10aa_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-9;
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Combine 10 elements at a time */
    for (i = 0; i < limit; i+=10) {
	data_t t1 = data[i] OP data[i+1];
	data_t t2 = data[i+2] OP data[i+3];
	data_t u1 = t1 OP t2;
	data_t t3 = data[i+4] OP data[i+5];
	data_t t4 = data[i+6] OP data[i+7];
	data_t t5 = data[i+8] OP data[i+9];
	data_t u2 = t3 OP t4 OP t5;
	acc = acc OP (u1 OP u2); 
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc = acc OP data[i];
    }

    *dest = acc;
}


char unroll12aa_descr[] = "unroll12aa: Array code, unrolled by 12, Different Associativity";
void unroll12aa_combine(vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(v);
    long limit = length-11;
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Combine 12 elements at a time */
    for (i = 0; i < limit; i+=12) {
	data_t t1 = data[i] OP data[i+1];
	data_t t2 = data[i+2] OP data[i+3];
	data_t u1 = t1 OP t2;
	data_t t3 = data[i+4] OP data[i+5];
	data_t t4 = data[i+6] OP data[i+7];
	data_t u2 = t3 OP t4;
	data_t t5 = data[i+8] OP data[i+9];
	data_t t6 = data[i+10] OP data[i+11];
	data_t u3 = t5 OP t6;
	acc = acc OP (u1 OP u2 OP u3); 
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
	acc = acc OP data[i];
    }

    *dest = acc;
}

/* Experiment using GCC support for SSE instructions */

/* $begin simd_vec_sizes */
/* Number of bytes in a vector */
#define VBYTES 32

/* Number of elements in a vector */
#define VSIZE VBYTES/sizeof(data_t)
/* $end simd_vec_sizes */

/* $begin simd_vec_t */
/* Vector data type */
typedef data_t vec_t __attribute__ ((vector_size(VBYTES)));
/* $end simd_vec_t */

/* $begin simd_pack_t */
typedef union {
    vec_t v;
    data_t d[VSIZE];
} pack_t;
/* $end simd_pack_t */

char simd_v1_descr[] = "simd_v1: SSE code, 1*VSIZE-way parallelism";
/* $begin simd_combine-c */
void simd_v1_combine(vec_ptr v, data_t *dest)
{
    long i;
    pack_t xfer;
    vec_t accum;
    data_t *data = get_vec_start(v);
    int cnt = vec_length(v);
    data_t result = IDENT;

    /* Initialize accum entries to IDENT */
    for (i = 0; i < VSIZE; i++) //line:opt:simd:initstart
	xfer.d[i] = IDENT;
    accum = xfer.v;             //line:opt:simd:initend
    /* $end simd_init-c */

    /* Single step until have memory alignment */
    while ((((size_t) data) % VBYTES) != 0 && cnt) {  //line:opt:simd:startstart
	result = result OP *data++; 
	cnt--;                              
    }                              //line:opt:simd:startend

    /* Step through data with VSIZE-way parallelism */
    while (cnt >= VSIZE) {    //line:opt:simd:loopstart
	vec_t chunk = *((vec_t *) data);
	accum = accum OP chunk;
	data += VSIZE;
	cnt -= VSIZE;       
    } //line:opt:simd:loopend

    /* Single-step through remaining elements */
    while (cnt) { //line:opt:simd:loopfinishstart
	result = result OP *data++;
	cnt--;  
    } //line:opt:simd:loopfinishend

    /* Combine elements of accumulator vector */
    xfer.v = accum; //line:opt:simd:finishstart
    for (i = 0; i < VSIZE; i++)
	result = result OP xfer.d[i]; //line:opt:simd:finishend

    /* Store result */
    *dest = result; 
}
/* $end simd_combine-c */


char simd_v2_descr[] = "simd_v2: SSE code, 2*VSIZE-way parallelism";
void simd_v2_combine(vec_ptr v, data_t *dest)
{
    long i;
    pack_t xfer;
    vec_t accum0, accum1;
    data_t *data = get_vec_start(v);
    int cnt = vec_length(v);
    data_t result = IDENT;

    /* Initialize to accum IDENT */
    for (i = 0; i < VSIZE; i++)
	xfer.d[i] = IDENT;
    accum0 = xfer.v;
    accum1 = xfer.v;

    /* Single step until have memory alignment */
    while ((((size_t) data) % VBYTES) != 0 && cnt) {
	result = result OP *data++; 
	cnt--;                              
    }                              //line:opt:simd:startend

    while (cnt >= 2*VSIZE) {
	vec_t chunk0 = *((vec_t *) data);
	vec_t chunk1 = *((vec_t *) (data+VSIZE));
	accum0 = accum0 OP chunk0;
	accum1 = accum1 OP chunk1;
	data += 2*VSIZE;
	cnt -= 2*VSIZE;
    }
    while (cnt) {
	result = result OP *data++;
	cnt--;
    }
    xfer.v = accum0 OP accum1;
    for (i = 0; i < VSIZE; i++)
	result = result OP xfer.d[i];
    *dest = result;
}

char simd_v4_descr[] = "simd_v4: SSE code, 4*VSIZE-way parallelism";
void simd_v4_combine(vec_ptr v, data_t *dest)
{
    long i;
    pack_t xfer;
    data_t *data = get_vec_start(v);
    int cnt = vec_length(v);
    data_t result = IDENT;

    /* Create 4 accumulators and initialize elements to IDENT */
    vec_t accum0, accum1, accum2, accum3;
    for (i = 0; i < VSIZE; i++)
	xfer.d[i] = IDENT;
    accum0 = xfer.v; accum1 = xfer.v;
    accum2 = xfer.v; accum3 = xfer.v;
    
    /* Single step until have memory alignment */
    while ((((size_t) data) % VBYTES) != 0 && cnt) {
	result = result OP *data++; 
	cnt--;                              
    }                              //line:opt:simd:startend

    /* $begin simd_v4_loop-c */
    /* 4 * VSIZE x 4 * VSIZE unrolling */
    while (cnt >= 4*VSIZE) {
	vec_t chunk0 = *((vec_t *) data);
	vec_t chunk1 = *((vec_t *) (data+VSIZE));
	vec_t chunk2 = *((vec_t *) (data+2*VSIZE));
	vec_t chunk3 = *((vec_t *) (data+3*VSIZE));
	accum0 = accum0 OP chunk0;
	accum1 = accum1 OP chunk1;
	accum2 = accum2 OP chunk2;
	accum3 = accum3 OP chunk3;
	data += 4*VSIZE;
	cnt -= 4*VSIZE;
    }
    /* $end simd_v4_loop-c */

    while (cnt) {
	result = result OP *data++;
	cnt--;
    }

    /* $begin simd_v4_accum-c */
    /* Combine into single accumulator */
    xfer.v = (accum0 OP accum1) OP (accum2 OP accum3);

    /* Combine results from accumulators within vector */
    for (i = 0; i < VSIZE; i++)
	result = result OP xfer.d[i];
    /* $end simd_v4_accum-c */
    *dest = result;
}

char simd_v8_descr[] = "simd_v8: SSE code, 8*VSIZE-way parallelism";
void simd_v8_combine(vec_ptr v, data_t *dest)
{
    long i;
    pack_t xfer;
    vec_t accum0, accum1, accum2, accum3, accum4, accum5, accum6, accum7;
    data_t *data = get_vec_start(v);
    int cnt = vec_length(v);
    data_t result = IDENT;

    /* Initialize to accum IDENT */
    for (i = 0; i < VSIZE; i++)
	xfer.d[i] = IDENT;
    accum0 = xfer.v;
    accum1 = xfer.v;
    accum2 = xfer.v;
    accum3 = xfer.v;
    accum4 = xfer.v;
    accum5 = xfer.v;
    accum6 = xfer.v;
    accum7 = xfer.v;
    
    /* Single step until have memory alignment */
    while ((((size_t) data) % VBYTES) != 0 && cnt) {
	result = result OP *data++; 
	cnt--;                              
    }                              //line:opt:simd:startend

    while (cnt >= 8*VSIZE) {
	vec_t chunk0 = *((vec_t *) data);
	vec_t chunk1 = *((vec_t *) (data+VSIZE));
	vec_t chunk2 = *((vec_t *) (data+2*VSIZE));
	vec_t chunk3 = *((vec_t *) (data+3*VSIZE));
	vec_t chunk4 = *((vec_t *) (data+4*VSIZE));
	vec_t chunk5 = *((vec_t *) (data+5*VSIZE));
	vec_t chunk6 = *((vec_t *) (data+6*VSIZE));
	vec_t chunk7 = *((vec_t *) (data+7*VSIZE));
	accum0 = accum0 OP chunk0;
	accum1 = accum1 OP chunk1;
	accum2 = accum2 OP chunk2;
	accum3 = accum3 OP chunk3;
	accum4 = accum4 OP chunk4;
	accum5 = accum5 OP chunk5;
	accum6 = accum6 OP chunk6;
	accum7 = accum7 OP chunk7;
	data += 8*VSIZE;
	cnt -= 8*VSIZE;
    }
    while (cnt) {
	result = result OP *data++;
	cnt--;
    }
    xfer.v = (accum0 OP accum1) OP (accum2 OP accum3);
    xfer.v = xfer.v OP (accum4 OP accum5) OP (accum6 OP accum7);
    for (i = 0; i < VSIZE; i++)
	result = result OP xfer.d[i];
    *dest = result;
}

char simd_v10_descr[] = "simd_v10: SSE code, 10*VSIZE-way parallelism";
void simd_v10_combine(vec_ptr v, data_t *dest)
{
    long i;
    pack_t xfer;
    vec_t accum0, accum1, accum2, accum3, accum4, accum5, accum6, accum7, accum8, accum9;
    data_t *data = get_vec_start(v);
    int cnt = vec_length(v);
    data_t result = IDENT;

    /* Initialize to accum IDENT */
    for (i = 0; i < VSIZE; i++)
	xfer.d[i] = IDENT;
    accum0 = xfer.v;
    accum1 = xfer.v;
    accum2 = xfer.v;
    accum3 = xfer.v;
    accum4 = xfer.v;
    accum5 = xfer.v;
    accum6 = xfer.v;
    accum7 = xfer.v;
    accum8 = xfer.v;
    accum9 = xfer.v;
    
    /* Single step until have memory alignment */
    while ((((size_t) data) % VBYTES) != 0 && cnt) {
	result = result OP *data++; 
	cnt--;                              
    }                              //line:opt:simd:startend

    while (cnt >= 10*VSIZE) {
	vec_t chunk0 = *((vec_t *) data);
	vec_t chunk1 = *((vec_t *) (data+VSIZE));
	vec_t chunk2 = *((vec_t *) (data+2*VSIZE));
	vec_t chunk3 = *((vec_t *) (data+3*VSIZE));
	vec_t chunk4 = *((vec_t *) (data+4*VSIZE));
	vec_t chunk5 = *((vec_t *) (data+5*VSIZE));
	vec_t chunk6 = *((vec_t *) (data+6*VSIZE));
	vec_t chunk7 = *((vec_t *) (data+7*VSIZE));
	vec_t chunk8 = *((vec_t *) (data+8*VSIZE));
	vec_t chunk9 = *((vec_t *) (data+9*VSIZE));
	accum0 = accum0 OP chunk0;
	accum1 = accum1 OP chunk1;
	accum2 = accum2 OP chunk2;
	accum3 = accum3 OP chunk3;
	accum4 = accum4 OP chunk4;
	accum5 = accum5 OP chunk5;
	accum6 = accum6 OP chunk6;
	accum7 = accum7 OP chunk7;
	accum8 = accum8 OP chunk8;
	accum9 = accum9 OP chunk9;
	data += 10*VSIZE;
	cnt -= 10*VSIZE;
    }
    while (cnt) {
	result = result OP *data++;
	cnt--;
    }
    xfer.v = (accum0 OP accum1) OP (accum2 OP accum3);
    xfer.v = xfer.v OP (accum4 OP accum5) OP (accum6 OP accum7);
    xfer.v = xfer.v OP (accum8 OP accum9);
    for (i = 0; i < VSIZE; i++)
	result = result OP xfer.d[i];
    *dest = result;
}

char simd_v12_descr[] = "simd_v12: SSE code, 12*VSIZE-way parallelism";
void simd_v12_combine(vec_ptr v, data_t *dest)
{
    long i;
    pack_t xfer;
    vec_t accum0, accum1, accum2, accum3, accum4, accum5, accum6, accum7;
    vec_t accum8, accum9, accum10, accum11;
    data_t *data = get_vec_start(v);
    int cnt = vec_length(v);
    data_t result = IDENT;

    /* Initialize to accum IDENT */
    for (i = 0; i < VSIZE; i++)
	xfer.d[i] = IDENT;
    accum0 = xfer.v;
    accum1 = xfer.v;
    accum2 = xfer.v;
    accum3 = xfer.v;
    accum4 = xfer.v;
    accum5 = xfer.v;
    accum6 = xfer.v;
    accum7 = xfer.v;
    accum8 = xfer.v;
    accum9 = xfer.v;
    accum10 = xfer.v;
    accum11 = xfer.v;

    /* Single step until have memory alignment */
    while ((((size_t) data) % VBYTES) != 0 && cnt) {
	result = result OP *data++; 
	cnt--;                              
    }                              //line:opt:simd:startend

    while (cnt >= 12*VSIZE) {
	vec_t chunk0 = *((vec_t *) data);
	vec_t chunk1 = *((vec_t *) (data+VSIZE));
	vec_t chunk2 = *((vec_t *) (data+2*VSIZE));
	vec_t chunk3 = *((vec_t *) (data+3*VSIZE));
	vec_t chunk4 = *((vec_t *) (data+4*VSIZE));
	vec_t chunk5 = *((vec_t *) (data+5*VSIZE));
	vec_t chunk6 = *((vec_t *) (data+6*VSIZE));
	vec_t chunk7 = *((vec_t *) (data+7*VSIZE));
	vec_t chunk8 = *((vec_t *) (data+8*VSIZE));
	vec_t chunk9 = *((vec_t *) (data+9*VSIZE));
	vec_t chunk10 = *((vec_t *) (data+10*VSIZE));
	vec_t chunk11 = *((vec_t *) (data+11*VSIZE));
	accum0 = accum0 OP chunk0;
	accum1 = accum1 OP chunk1;
	accum2 = accum2 OP chunk2;
	accum3 = accum3 OP chunk3;
	accum4 = accum4 OP chunk4;
	accum5 = accum5 OP chunk5;
	accum6 = accum6 OP chunk6;
	accum7 = accum7 OP chunk7;
	accum8 = accum8 OP chunk8;
	accum9 = accum9 OP chunk9;
	accum10 = accum10 OP chunk10;
	accum11 = accum11 OP chunk11;
	data += 12*VSIZE;
	cnt -= 12*VSIZE;
    }
    while (cnt) {
	result = result OP *data++;
	cnt--;
    }
    xfer.v = (accum0 OP accum1) OP (accum2 OP accum3);
    xfer.v = xfer.v OP (accum4 OP accum5) OP (accum6 OP accum7);
    xfer.v = xfer.v OP (accum8 OP accum9) OP (accum10 OP accum11);
    for (i = 0; i < VSIZE; i++)
	result = result OP xfer.d[i];
    *dest = result;
}

/* Same idea, but use different associativity to get parallelism */
char simd_v2a_descr[] = "simd_v2a: SSE code, 2*VSIZE-way parallelism, reassociate";
void simd_v2a_combine(vec_ptr v, data_t *dest)
{
    long i;
    pack_t xfer;
    vec_t accum;
    data_t *data = get_vec_start(v);
    int cnt = vec_length(v);
    data_t result = IDENT;

    /* Initialize accum to IDENT */
    for (i = 0; i < VSIZE; i++)
	xfer.d[i] = IDENT;
    accum = xfer.v;

    /* Single step until have memory alignment */
    while ((((size_t) data) % VBYTES) != 0 && cnt) {
	result = result OP *data++; 
	cnt--;                              
    }                              //line:opt:simd:startend

    while (cnt >= 2*VSIZE) {
	vec_t chunk0 = *((vec_t *) data);
	vec_t chunk1 = *((vec_t *) (data+VSIZE));
	accum = accum OP (chunk0 OP chunk1);
	data += 2*VSIZE;
	cnt -= 2*VSIZE;
    }
    while (cnt) {
	result = result OP *data++;
	cnt--;
    }
    xfer.v = accum;
    for (i = 0; i < VSIZE; i++)
	result = result OP xfer.d[i];
    *dest = result;
}

char simd_v4a_descr[] = "simd_v4a: SSE code, 4*VSIZE-way parallelism, reassociate";
void simd_v4a_combine(vec_ptr v, data_t *dest)
{
    long i;
    pack_t xfer;
    vec_t accum;
    data_t *data = get_vec_start(v);
    int cnt = vec_length(v);
    data_t result = IDENT;

    /* Initialize accum to IDENT */
    for (i = 0; i < VSIZE; i++)
	xfer.d[i] = IDENT;
    accum = xfer.v;

    /* Single step until have memory alignment */
    while ((((size_t) data) % VBYTES) != 0 && cnt) {
	result = result OP *data++; 
	cnt--;                              
    }                              //line:opt:simd:startend

    /* $begin simd_v4a_loop-c */
    while (cnt >= 4*VSIZE) {
	vec_t chunk0 = *((vec_t *) data);
	vec_t chunk1 = *((vec_t *) (data+VSIZE));
	vec_t chunk2 = *((vec_t *) (data+2*VSIZE));
	vec_t chunk3 = *((vec_t *) (data+3*VSIZE));
	accum = accum OP
	    ((chunk0 OP chunk1) OP (chunk2 OP chunk3));
	data += 4*VSIZE;
	cnt -= 4*VSIZE;
    }
    /* $end simd_v4a_loop-c */

    while (cnt) {
	result = result OP *data++;
	cnt--;
    }
    xfer.v = accum;
    for (i = 0; i < VSIZE; i++)
	result = result OP xfer.d[i];
    *dest = result;
}

char simd_v8a_descr[] = "simd_v8a: SSE code, 8*VSIZE-way parallelism, reassociate";
void simd_v8a_combine(vec_ptr v, data_t *dest)
{
    long i;
    pack_t xfer;
    vec_t accum;
    data_t *data = get_vec_start(v);
    int cnt = vec_length(v);
    data_t result = IDENT;

    /* Initialize accum to IDENT */
    for (i = 0; i < VSIZE; i++)
	xfer.d[i] = IDENT;
    accum = xfer.v;

    /* Single step until have memory alignment */
    while ((((size_t) data) % VBYTES) != 0 && cnt) {
	result = result OP *data++; 
	cnt--;                              
    }                              //line:opt:simd:startend

    while (cnt >= 8*VSIZE) {
	vec_t chunk0 = *((vec_t *) data);
	vec_t chunk1 = *((vec_t *) (data+VSIZE));
	vec_t chunk2 = *((vec_t *) (data+2*VSIZE));
	vec_t chunk3 = *((vec_t *) (data+3*VSIZE));
	vec_t chunk4 = *((vec_t *) (data+4*VSIZE));
	vec_t chunk5 = *((vec_t *) (data+5*VSIZE));
	vec_t chunk6 = *((vec_t *) (data+6*VSIZE));
	vec_t chunk7 = *((vec_t *) (data+7*VSIZE));
	accum = accum OP
	    (((chunk0 OP chunk1) OP (chunk2 OP chunk3))
	     OP
	     ((chunk4 OP chunk5) OP (chunk6 OP chunk7)));
	data += 8*VSIZE;
	cnt -= 8*VSIZE;
    }
    while (cnt) {
	result = result OP *data++;
	cnt--;
    }
    xfer.v = accum;
    for (i = 0; i < VSIZE; i++)
	result = result OP xfer.d[i];
    *dest = result;
}


void register_combiners(void)
{
#ifndef UNROLL_ONLY
    add_combiner(combine1, combine1, combine1_descr);
    add_combiner(combine2, combine1, combine2_descr);
    add_combiner(combine3, combine1, combine3_descr);
    add_combiner(combine3w, combine1, combine3w_descr);
#endif
    add_combiner(combine4, combine1,combine4_descr);
#ifndef UNROLL_ONLY
    add_combiner(combine4b, combine1, combine4b_descr);
    add_combiner(combine4p, combine1, combine4p_descr);
#endif
    add_combiner(combine5, combine1, combine5_descr);
#ifndef UNROLL_ONLY
    add_combiner(combine5p, combine1, combine5p_descr);
    add_combiner(unroll2aw_combine, combine1, unroll2aw_descr);
#endif
    add_combiner(unroll3a_combine, combine1, unroll3a_descr);
    add_combiner(unroll4a_combine, combine1, unroll4a_descr);
    add_combiner(unroll5a_combine, combine1, unroll5a_descr);
    add_combiner(unroll6a_combine, combine1, unroll6a_descr);
    add_combiner(unroll7a_combine, combine1, unroll7a_descr);
    add_combiner(unroll8a_combine, combine1, unroll8a_descr);
    add_combiner(unroll9a_combine, combine1, unroll9a_descr);
    add_combiner(unroll10a_combine, combine1, unroll10a_descr);
    add_combiner(unroll16a_combine, combine1, unroll16a_descr);
#ifndef UNROLL_ONLY
    add_combiner(unroll2_combine, combine1, unroll2_descr);
    add_combiner(unroll3_combine, combine1, unroll3_descr);
    add_combiner(unroll4_combine, combine1, unroll4_descr);
    add_combiner(unroll8_combine, combine1, unroll8_descr);
    add_combiner(unroll16_combine, combine1, unroll16_descr);
    add_combiner(combine6, combine1, combine6_descr);
    add_combiner(unroll4x2a_combine, combine1, unroll4x2a_descr);
    add_combiner(unroll8x2a_combine, combine1, unroll8x2a_descr);
    add_combiner(unroll3x3a_combine, combine1, unroll3x3a_descr);
    add_combiner(unroll4x4a_combine, combine1, unroll4x4a_descr);
    add_combiner(unroll5x5a_combine, combine1, unroll5x5a_descr);
    add_combiner(unroll6x6a_combine, combine1, unroll6x6a_descr);
    add_combiner(unroll7x7a_combine, combine1, unroll7x7a_descr);
    add_combiner(unroll8x4a_combine, combine1, unroll8x4a_descr);
    add_combiner(unroll8x8a_combine, combine1, unroll8x8a_descr);
    add_combiner(unroll9x9a_combine, combine1, unroll9x9a_descr);
    add_combiner(unroll10x10a_combine, combine1, unroll10x10a_descr);
    add_combiner(unroll12x6a_combine, combine1, unroll12x6a_descr);
    add_combiner(unroll12x12a_combine, combine1, unroll12x12a_descr);
    add_combiner(unroll16x16a_combine, combine1, unroll16x16a_descr);
    add_combiner(unroll20x20a_combine, combine1, unroll20x20a_descr);
    add_combiner(unroll8x2_combine, combine1, unroll8x2_descr);
    add_combiner(unroll8x4_combine, combine1, unroll8x4_descr);
    add_combiner(unroll8x8_combine, combine1, unroll8x8_descr);
    add_combiner(unroll9x3_combine, combine1, unroll9x3_descr);
    add_combiner(unrollx2as_combine, combine1, unrollx2as_descr);
    add_combiner(combine7, combine1, combine7_descr);
    add_combiner(unroll3aa_combine, combine1, unroll3aa_descr);
    add_combiner(unroll4aa_combine, combine1, unroll4aa_descr);
    add_combiner(unroll5aa_combine, combine1, unroll5aa_descr);
    add_combiner(unroll6aa_combine, combine1, unroll6aa_descr);
    add_combiner(unroll7aa_combine, combine1, unroll7aa_descr);
    add_combiner(unroll8aa_combine, combine1, unroll8aa_descr);
    add_combiner(unroll9aa_combine, combine1, unroll9aa_descr);
    add_combiner(unroll10aa_combine, combine1, unroll10aa_descr);
    add_combiner(unroll12aa_combine, combine1, unroll12aa_descr);
    add_combiner(simd_v1_combine, combine1, simd_v1_descr);
    add_combiner(simd_v2_combine, combine1, simd_v2_descr);
    add_combiner(simd_v4_combine, combine1, simd_v4_descr);
    add_combiner(simd_v8_combine, combine1, simd_v8_descr);
    add_combiner(simd_v10_combine, combine1, simd_v10_descr);
    add_combiner(simd_v12_combine, combine1, simd_v12_descr);
    add_combiner(simd_v2a_combine, combine1, simd_v2a_descr);
    add_combiner(simd_v4a_combine, combine1, simd_v4a_descr);
    add_combiner(simd_v8a_combine, combine1, simd_v8a_descr);
    log_combiner(simd_v8a_combine, 0.16, 0.24);
#endif
}







